<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
<meta name="viewport" content="width=device-width">
<meta name="theme-color" content="#222"><meta name="generator" content="Hexo 6.2.0">


  <link rel="apple-touch-icon" sizes="180x180" href="https://www.wulihub.com.cn/gc/Wez0VG/images/apple-touch-icon.png">
  <link rel="icon" type="image/png" sizes="32x32" href="https://www.wulihub.com.cn/gc/Wez0VG/images/favicon-32x32.png">
  <link rel="icon" type="image/png" sizes="16x16" href="https://www.wulihub.com.cn/gc/Wez0VG/images/favicon-16x16.png">
  <link rel="mask-icon" href="https://www.wulihub.com.cn/gc/Wez0VG/images/logo.svg" color="#222">

<link rel="stylesheet" href="https://www.wulihub.com.cn/gc/Wez0VG/css/main.css">



<link rel="stylesheet" href="https://npm.elemecdn.com/@fortawesome/fontawesome-free@6.1.1/css/all.min.css" integrity="sha256-DfWjNxDkM94fVBWx1H5BMMp0Zq7luBlV8QRcSES7s+0=" crossorigin="anonymous">
  <link rel="stylesheet" href="https://npm.elemecdn.com/animate.css@3.1.1/animate.min.css" integrity="sha256-PR7ttpcvz8qrF57fur/yAx1qXMFJeJFiA6pSzWi0OIE=" crossorigin="anonymous">

<script class="next-config" data-name="main" type="application/json">{"hostname":"nobige.cn","root":"/","images":"https://www.wulihub.com.cn/gc/Wez0VG/images","scheme":"Gemini","darkmode":false,"version":"8.12.2","exturl":false,"sidebar":{"position":"left","display":"post","padding":18,"offset":12},"copycode":{"enable":false,"style":null},"bookmark":{"enable":false,"color":"#222","save":"auto"},"mediumzoom":false,"lazyload":false,"pangu":false,"comments":{"style":"tabs","active":null,"storage":true,"lazyload":false,"nav":null},"stickytabs":false,"motion":{"enable":true,"async":false,"transition":{"post_block":"fadeIn","post_header":"fadeInDown","post_body":"fadeInDown","coll_header":"fadeInLeft","sidebar":"fadeInUp"}},"prism":false,"i18n":{"placeholder":"Searching...","empty":"We didn't find any results for the search: ${query}","hits_time":"${hits} results found in ${time} ms","hits":"${hits} results found"},"path":"/search.xml","localsearch":{"enable":true,"trigger":"auto","top_n_per_article":1,"unescape":false,"preload":false}}</script><script src="https://www.wulihub.com.cn/gc/Wez0VG/js/config.js"></script>

    <meta name="description" content="python利用xpath定位问题 以豆瓣top250为例，xpath定位的方式是“先大后小”，即先获取到列表然后再获取单部电影的信息。 列表最一层为ol元素且有class属性，要取列表则是 selector.xpath(&#x27;&#x2F;&#x2F;ol[@class&#x3D;&quot;grid_view&quot;]&#x2F;li&#x27;)  取完后，单个信息便是简单获取了只要遍历即可，如电影封面图片，电影名字，评">
<meta property="og:type" content="article">
<meta property="og:title" content="python利用xpath定位问题">
<meta property="og:url" content="https://nobige.cn/post/20191031-python_li_yong_xpath_ding_wei_wen_ti/index.html">
<meta property="og:site_name" content="NoBige-JackCh3n">
<meta property="og:description" content="python利用xpath定位问题 以豆瓣top250为例，xpath定位的方式是“先大后小”，即先获取到列表然后再获取单部电影的信息。 列表最一层为ol元素且有class属性，要取列表则是 selector.xpath(&#x27;&#x2F;&#x2F;ol[@class&#x3D;&quot;grid_view&quot;]&#x2F;li&#x27;)  取完后，单个信息便是简单获取了只要遍历即可，如电影封面图片，电影名字，评">
<meta property="og:locale" content="en_US">
<meta property="og:image" content="https://blog-static.nobige.cn/gh/JackCh3n/n0bige/post/20191031-python_li_yong_xpath_ding_wei_wen_ti/xiao2019-10-31-20-38-51-850.jpg">
<meta property="article:published_time" content="2019-10-31T12:37:04.000Z">
<meta property="article:modified_time" content="2024-11-07T02:59:06.238Z">
<meta property="article:author" content="JackCh3n">
<meta property="article:tag" content="xpath">
<meta property="article:tag" content="lxml">
<meta name="twitter:card" content="summary">
<meta name="twitter:image" content="https://blog-static.nobige.cn/gh/JackCh3n/n0bige/post/20191031-python_li_yong_xpath_ding_wei_wen_ti/xiao2019-10-31-20-38-51-850.jpg">


<link rel="canonical" href="https://nobige.cn/post/20191031-python_li_yong_xpath_ding_wei_wen_ti/">



<script class="next-config" data-name="page" type="application/json">{"sidebar":"","isHome":false,"isPost":true,"lang":"en","comments":true,"permalink":"https://nobige.cn/post/20191031-python_li_yong_xpath_ding_wei_wen_ti/","path":"post/20191031-python_li_yong_xpath_ding_wei_wen_ti/","title":"python利用xpath定位问题"}</script>

<script class="next-config" data-name="calendar" type="application/json">""</script>
<title>python利用xpath定位问题 | NoBige-JackCh3n</title>
  
  <script class="next-config" data-name="google_analytics" type="application/json">{"tracking_id":"UA-122817633-1","only_pageview":true}</script>
  <script src="https://www.wulihub.com.cn/gc/Wez0VG/js/third-party/analytics/google-analytics.js"></script>

  <script src="https://www.wulihub.com.cn/gc/Wez0VG/js/third-party/analytics/baidu-analytics.js"></script>
  <script async src="https://hm.baidu.com/hm.js?96741f92c07d010c0ddd1d02197893e2"></script>




  <noscript>
    <link rel="stylesheet" href="https://www.wulihub.com.cn/gc/Wez0VG/css/noscript.css">
  </noscript>
</head>

<body itemscope itemtype="http://schema.org/WebPage" class="use-motion">
  <div class="headband"></div>

  <main class="main">
    <header class="header" itemscope itemtype="http://schema.org/WPHeader">
      <div class="header-inner"><div class="site-brand-container">
  <div class="site-nav-toggle">
    <div class="toggle" aria-label="Toggle navigation bar" role="button">
        <span class="toggle-line"></span>
        <span class="toggle-line"></span>
        <span class="toggle-line"></span>
    </div>
  </div>

  <div class="site-meta">

    <a href="/" class="brand" rel="start">
      <i class="logo-line"></i>
      <p class="site-title">NoBige-JackCh3n</p>
      <i class="logo-line"></i>
    </a>
      <p class="site-subtitle" itemprop="description">copy to my blog</p>
  </div>

  <div class="site-nav-right">
    <div class="toggle popup-trigger">
        <i class="fa fa-search fa-fw fa-lg"></i>
    </div>
  </div>
</div>



<nav class="site-nav">
  <ul class="main-menu menu"><li class="menu-item menu-item-home"><a href="/" rel="section"><i class="home fa-fw"></i>Home</a></li><li class="menu-item menu-item-about"><a href="/about/" rel="section"><i class="user fa-fw"></i>About</a></li><li class="menu-item menu-item-tags"><a href="/tags/" rel="section"><i class="tags fa-fw"></i>Tags</a></li><li class="menu-item menu-item-categories"><a href="/categories/" rel="section"><i class="th fa-fw"></i>Categories</a></li><li class="menu-item menu-item-archives"><a href="/archives/" rel="section"><i class="archive fa-fw"></i>Archives</a></li><li class="menu-item menu-item-sitemap"><a href="/sitemap.xml" rel="section"><i class="sitemap fa-fw"></i>Sitemap</a></li>
      <li class="menu-item menu-item-search">
        <a role="button" class="popup-trigger"><i class="fa fa-search fa-fw"></i>Search
        </a>
      </li>
  </ul>
</nav>



  <div class="search-pop-overlay">
    <div class="popup search-popup"><div class="search-header">
  <span class="search-icon">
    <i class="fa fa-search"></i>
  </span>
  <div class="search-input-container">
    <input autocomplete="off" autocapitalize="off" maxlength="80"
           placeholder="Searching..." spellcheck="false"
           type="search" class="search-input">
  </div>
  <span class="popup-btn-close" role="button">
    <i class="fa fa-times-circle"></i>
  </span>
</div>
<div class="search-result-container no-result">
  <div class="search-result-icon">
    <i class="fa fa-spinner fa-pulse fa-5x"></i>
  </div>
</div>

    </div>
  </div>

</div>
        
  
  <div class="toggle sidebar-toggle" role="button">
    <span class="toggle-line"></span>
    <span class="toggle-line"></span>
    <span class="toggle-line"></span>
  </div>

  <aside class="sidebar">

    <div class="sidebar-inner sidebar-nav-active sidebar-toc-active">
      <ul class="sidebar-nav">
        <li class="sidebar-nav-toc">
          Table of Contents
        </li>
        <li class="sidebar-nav-overview">
          Overview
        </li>
      </ul>

      <div class="sidebar-panel-container">
        <!--noindex-->
        <div class="post-toc-wrap sidebar-panel">
            <div class="post-toc animated"><ol class="nav"><li class="nav-item nav-level-2"><a class="nav-link" href="#python%E5%88%A9%E7%94%A8xpath%E5%AE%9A%E4%BD%8D%E9%97%AE%E9%A2%98"><span class="nav-number">1.</span> <span class="nav-text">python利用xpath定位问题</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#XPath-%E8%AF%AD%E6%B3%95"><span class="nav-number">2.</span> <span class="nav-text">XPath 语法</span></a><ol class="nav-child"><li class="nav-item nav-level-3"><a class="nav-link" href="#xpath%E8%8E%B7%E5%8F%96%E6%9F%90%E4%B8%AA%E8%8A%82%E7%82%B9%E7%9A%84html%E6%BA%90%E7%A0%81"><span class="nav-number">2.1.</span> <span class="nav-text">xpath获取某个节点的html源码</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#%E9%80%89%E5%8F%96%E8%8A%82%E7%82%B9"><span class="nav-number">2.2.</span> <span class="nav-text">选取节点</span></a><ol class="nav-child"><li class="nav-item nav-level-4"><a class="nav-link" href="#%E4%B8%8B%E9%9D%A2%E5%88%97%E5%87%BA%E4%BA%86%E6%9C%80%E6%9C%89%E7%94%A8%E7%9A%84%E8%B7%AF%E5%BE%84%E8%A1%A8%E8%BE%BE%E5%BC%8F%EF%BC%9A"><span class="nav-number">2.2.1.</span> <span class="nav-text">下面列出了最有用的路径表达式：</span></a></li></ol></li><li class="nav-item nav-level-3"><a class="nav-link" href="#%E8%B0%93%E8%AF%AD%EF%BC%88Predicates%EF%BC%89"><span class="nav-number">2.3.</span> <span class="nav-text">谓语（Predicates）</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#%E9%80%89%E5%8F%96%E6%9C%AA%E7%9F%A5%E8%8A%82%E7%82%B9"><span class="nav-number">2.4.</span> <span class="nav-text">选取未知节点</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#%E9%80%89%E5%8F%96%E8%8B%A5%E5%B9%B2%E8%B7%AF%E5%BE%84"><span class="nav-number">2.5.</span> <span class="nav-text">选取若干路径</span></a></li></ol></li><li class="nav-item nav-level-2"><a class="nav-link" href="#%E5%8F%82%E8%80%83"><span class="nav-number">3.</span> <span class="nav-text">参考</span></a></li></ol></div>
        </div>
        <!--/noindex-->

        <div class="site-overview-wrap sidebar-panel">
          <div class="site-author site-overview-item animated" itemprop="author" itemscope itemtype="http://schema.org/Person">
    <img class="site-author-image" itemprop="image" alt="JackCh3n"
      src="https://www.wulihub.com.cn/gc/Wez0VG/images/avatar.png">
  <p class="site-author-name" itemprop="name">JackCh3n</p>
  <div class="site-description" itemprop="description">Record the pit filling process with words</div>
</div>
<div class="site-state-wrap site-overview-item animated">
  <nav class="site-state">
      <div class="site-state-item site-state-posts">
        <a href="/archives/">
          <span class="site-state-item-count">146</span>
          <span class="site-state-item-name">posts</span>
        </a>
      </div>
      <div class="site-state-item site-state-categories">
          <a href="/categories/">
        <span class="site-state-item-count">42</span>
        <span class="site-state-item-name">categories</span></a>
      </div>
      <div class="site-state-item site-state-tags">
          <a href="/tags/">
        <span class="site-state-item-count">279</span>
        <span class="site-state-item-name">tags</span></a>
      </div>
  </nav>
</div>
  <div class="links-of-author site-overview-item animated">
      <span class="links-of-author-item">
        <a href="https://github.com/jackch3n" title="GitHub → https:&#x2F;&#x2F;github.com&#x2F;jackch3n" rel="noopener" target="_blank"><i class="github fa-fw"></i>GitHub</a>
      </span>
  </div>


  <div class="links-of-blogroll site-overview-item animated">
    <div class="links-of-blogroll-title"><i class="fa fa-globe fa-fw"></i>
      Links
    </div>
    <ul class="links-of-blogroll-list">
        <li class="links-of-blogroll-item">
          <a href="https://vercel.com/?f=nobige" title="https:&#x2F;&#x2F;vercel.com?f&#x3D;nobige" rel="noopener" target="_blank">vercel</a>
        </li>
        <li class="links-of-blogroll-item">
          <a href="https://www.jsdelivr.com/?f=nobige" title="https:&#x2F;&#x2F;www.jsdelivr.com?f&#x3D;nobige" rel="noopener" target="_blank">jsdelivr</a>
        </li>
        <li class="links-of-blogroll-item">
          <a href="https://www.upyun.com/?f=nobige" title="https:&#x2F;&#x2F;www.upyun.com?f&#x3D;nobige" rel="noopener" target="_blank">upyun</a>
        </li>
        <li class="links-of-blogroll-item">
          <a href="https://coding.net/?f=nobige" title="https:&#x2F;&#x2F;coding.net?f&#x3D;nobige" rel="noopener" target="_blank">coding</a>
        </li>
        <li class="links-of-blogroll-item">
          <a href="https://unsplash.com/?f=nobige" title="https:&#x2F;&#x2F;unsplash.com?f&#x3D;nobige" rel="noopener" target="_blank">unsplash</a>
        </li>
    </ul>
  </div>

        </div>
      </div>
    </div>
  </aside>
  <div class="sidebar-dimmer"></div>


    </header>

    
  <div class="back-to-top" role="button" aria-label="Back to top">
    <i class="fa fa-arrow-up"></i>
    <span>0%</span>
  </div>

<noscript>
  <div class="noscript-warning">Theme NexT works best with JavaScript enabled</div>
</noscript>


    <div class="main-inner post posts-expand">


  


<div class="post-block">
  
  

  <article itemscope itemtype="http://schema.org/Article" class="post-content" lang="en">
    <link itemprop="mainEntityOfPage" href="https://nobige.cn/post/20191031-python_li_yong_xpath_ding_wei_wen_ti/">

    <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
      <meta itemprop="image" content="https://www.wulihub.com.cn/gc/Wez0VG/images/avatar.png">
      <meta itemprop="name" content="JackCh3n">
    </span>

    <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
      <meta itemprop="name" content="NoBige-JackCh3n">
      <meta itemprop="description" content="Record the pit filling process with words">
    </span>

    <span hidden itemprop="post" itemscope itemtype="http://schema.org/CreativeWork">
      <meta itemprop="name" content="python利用xpath定位问题 | NoBige-JackCh3n">
      <meta itemprop="description" content="">
    </span>
      <header class="post-header">
        <h1 class="post-title" itemprop="name headline">
          python利用xpath定位问题
        </h1>

        <div class="post-meta-container">
          <div class="post-meta">
    <span class="post-meta-item">
      <span class="post-meta-item-icon">
        <i class="far fa-calendar"></i>
      </span>
      <span class="post-meta-item-text">Posted on</span>

      <time title="Created: 2019-10-31 20:37:04" itemprop="dateCreated datePublished" datetime="2019-10-31T20:37:04+08:00">2019-10-31</time>
    </span>
    <span class="post-meta-item">
      <span class="post-meta-item-icon">
        <i class="far fa-calendar-check"></i>
      </span>
      <span class="post-meta-item-text">Edited on</span>
      <time title="Modified: 2024-11-07 10:59:06" itemprop="dateModified" datetime="2024-11-07T10:59:06+08:00">2024-11-07</time>
    </span>
    <span class="post-meta-item">
      <span class="post-meta-item-icon">
        <i class="far fa-folder"></i>
      </span>
      <span class="post-meta-item-text">In</span>
        <span itemprop="about" itemscope itemtype="http://schema.org/Thing">
          <a href="/categories/python/" itemprop="url" rel="index"><span itemprop="name">python</span></a>
        </span>
    </span>

  
    <span class="post-meta-item" title="Views" id="busuanzi_container_page_pv">
      <span class="post-meta-item-icon">
        <i class="far fa-eye"></i>
      </span>
      <span class="post-meta-item-text">Views: </span>
      <span id="busuanzi_value_page_pv"></span>
    </span>
</div>

        </div>
      </header>

    
    
    
    <div class="post-body" itemprop="articleBody">
        <h2 id="python利用xpath定位问题"><a href="#python利用xpath定位问题" class="headerlink" title="python利用xpath定位问题"></a>python利用xpath定位问题</h2><p><img src="https://blog-static.nobige.cn/gh/JackCh3n/n0bige/post/20191031-python_li_yong_xpath_ding_wei_wen_ti/xiao2019-10-31-20-38-51-850.jpg" alt="豆瓣top250"></p>
<p>以豆瓣top250为例，xpath定位的方式是“先大后小”，即先获取到列表然后再获取单部电影的信息。</p>
<p>列表最外层为<code>ol</code>元素且有<code>class</code>属性，要取列表则是：</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">selector.xpath(<span class="string">&#x27;//ol[@class=&quot;grid_view&quot;]/li&#x27;</span>)</span><br></pre></td></tr></table></figure>

<p>取到列表后，单个信息的获取就简单了，只要遍历即可，如电影封面图片、电影名字、评分……</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment"># 电影封面图片</span></span><br><span class="line">item.xpath(<span class="string">&#x27;//img/@src&#x27;</span>)[<span class="number">0</span>]</span><br><span class="line"></span><br><span class="line"><span class="comment"># 电影名字</span></span><br><span class="line">item.xpath(<span class="string">&#x27;//div[@class=&quot;hd&quot;]/a/text()&#x27;</span>)</span><br><span class="line"></span><br><span class="line"><span class="comment"># 评分</span></span><br><span class="line">item.xpath(<span class="string">&#x27;//span[@class=&quot;rating_num&quot;]/text()&#x27;</span>)</span><br></pre></td></tr></table></figure>

<p>若到此也是按其xpath的路径来获取的话，获取的<strong>并不是列表里面单个电影的信息，而是网页中全部符合路径的信息</strong>，为什么？原因很简单，主要问题在于路径定位：以 <code>//</code> 开头的表达式会从整个文档开始匹配，而不是从当前节点 <code>item</code> 开始。</p>
<p>若是改成绝对定位也是可以，如下</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment"># 电影封面图片</span></span><br><span class="line">item.xpath(<span class="string">&#x27;//*[@id=&quot;content&quot;]/div/div[1]/ol/li/div/div[1]/a/img/@src&#x27;</span>)[<span class="number">0</span>]</span><br></pre></td></tr></table></figure>

<p>但是这样先获取列表就没有意义了。解决办法只需要加一个 <code>.</code>，修改之后的定位则是：</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment"># 电影封面图片</span></span><br><span class="line">item.xpath(<span class="string">&#x27;.//img/@src&#x27;</span>)[<span class="number">0</span>]</span><br><span class="line"></span><br><span class="line"><span class="comment"># 电影名字</span></span><br><span class="line">item.xpath(<span class="string">&#x27;.//div[@class=&quot;hd&quot;]/a/text()&#x27;</span>)</span><br><span class="line"></span><br><span class="line"><span class="comment"># 评分</span></span><br><span class="line">item.xpath(<span class="string">&#x27;.//span[@class=&quot;rating_num&quot;]/text()&#x27;</span>)</span><br></pre></td></tr></table></figure>

<p>可以理解为：从当前节点开始向下寻找。</p>
<h2 id="XPath-语法"><a href="#XPath-语法" class="headerlink" title="XPath 语法"></a>XPath 语法</h2><figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment"># 获取title</span></span><br><span class="line">item.xpath(<span class="string">&#x27;//title/text()&#x27;</span>)</span><br><span class="line"><span class="comment"># 获取图片链接</span></span><br><span class="line">item.xpath(<span class="string">&#x27;.//img/@src&#x27;</span>)</span><br><span class="line"><span class="comment"># 获取图片描述</span></span><br><span class="line">item.xpath(<span class="string">&#x27;.//img/@alt&#x27;</span>)</span><br><span class="line"><span class="comment"># 获取图片data-src</span></span><br><span class="line">item.xpath(<span class="string">&#x27;.//img/@data-src&#x27;</span>)</span><br><span class="line"><span class="comment"># 其他同理</span></span><br><span class="line"></span><br><span class="line"><span class="comment"># 获取指定节点下的所有节点，包含注释内容</span></span><br><span class="line">item.xpath(<span class="string">&#x27;//div[@class=&quot;jjText&quot;]/node()&#x27;</span>)</span><br><span class="line"><span class="comment"># 再进一步进行提取注释内容</span></span><br><span class="line"></span><br><span class="line"><span class="comment"># 获取全部注释内容</span></span><br><span class="line">item.xpath(<span 
class="string">&#x27;.//comment()&#x27;</span>)</span><br><span class="line"></span><br><span class="line">/ 根节点选取、或是元素和元素的过渡。</span><br><span class="line">// 从任意位置选取节点，不考虑其在文档中的层级。</span><br></pre></td></tr></table></figure>

<h3 id="xpath获取某个节点的html源码"><a href="#xpath获取某个节点的html源码" class="headerlink" title="xpath获取某个节点的html源码"></a>xpath获取某个节点的html源码</h3><figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">import</span> requests</span><br><span class="line"><span class="keyword">import</span> lxml.html</span><br><span class="line"><span class="keyword">import</span> html</span><br><span class="line">temp_html = requests.get(<span class="string">&#x27;https://www.thepaper.cn/newsDetail_forward_8054822&#x27;</span>).content.decode(<span class="string">&#x27;utf-8&#x27;</span>)</span><br><span class="line">selector = lxml.html.fromstring(temp_html)</span><br><span class="line">temp_txt = selector.xpath(<span class="string">&#x27;//div[@class=&quot;news_txt&quot;]&#x27;</span>)</span><br><span class="line">temp_htmls = <span class="string">&#x27;&#x27;</span></span><br><span class="line"><span class="keyword">for</span> x <span class="keyword">in</span> temp_txt:</span><br><span class="line">    temp_htmls += html.unescape(lxml.html.tostring(x).decode(<span class="string">&#x27;utf-8&#x27;</span>))</span><br><span class="line"><span class="comment"># html.unescape 是html转码,解析,转义</span></span><br><span class="line"><span class="built_in">print</span>(temp_htmls)</span><br></pre></td></tr></table></figure>



<h3 id="选取节点"><a href="#选取节点" class="headerlink" title="选取节点"></a>选取节点</h3><p>XPath 使用路径表达式在 XML 文档中选取节点。节点是通过沿着路径或者 step 来选取的。</p>
<h4 id="下面列出了最有用的路径表达式："><a href="#下面列出了最有用的路径表达式：" class="headerlink" title="下面列出了最有用的路径表达式："></a>下面列出了最有用的路径表达式：</h4><table>
<thead>
<tr>
<th align="left">表达式</th>
<th align="left">描述</th>
</tr>
</thead>
<tbody><tr>
<td align="left">nodename</td>
<td align="left">选取此节点的所有子节点。</td>
</tr>
<tr>
<td align="left">&#x2F;</td>
<td align="left">从根节点选取。</td>
</tr>
<tr>
<td align="left">&#x2F;&#x2F;</td>
<td align="left">从匹配选择的当前节点选择文档中的节点，而不考虑它们的位置。</td>
</tr>
<tr>
<td align="left">.</td>
<td align="left">选取当前节点。</td>
</tr>
<tr>
<td align="left">..</td>
<td align="left">选取当前节点的父节点。</td>
</tr>
<tr>
<td align="left">@</td>
<td align="left">选取属性。</td>
</tr>
</tbody></table>
<p><strong>实例</strong></p>
<p>在下面的表格中，我们已列出了一些路径表达式以及表达式的结果：</p>
<table>
<thead>
<tr>
<th align="left">路径表达式</th>
<th align="left">结果</th>
</tr>
</thead>
<tbody><tr>
<td align="left">bookstore</td>
<td align="left">选取 bookstore 元素的所有子节点。</td>
</tr>
<tr>
<td align="left">&#x2F;bookstore</td>
<td align="left">选取根元素 bookstore。注释：假如路径起始于正斜杠( &#x2F; )，则此路径始终代表到某元素的绝对路径！</td>
</tr>
<tr>
<td align="left">bookstore&#x2F;book</td>
<td align="left">选取属于 bookstore 的子元素的所有 book 元素。</td>
</tr>
<tr>
<td align="left">&#x2F;&#x2F;book</td>
<td align="left">选取所有 book 子元素，而不管它们在文档中的位置。</td>
</tr>
<tr>
<td align="left">bookstore&#x2F;&#x2F;book</td>
<td align="left">选择属于 bookstore 元素的后代的所有 book 元素，而不管它们位于 bookstore 之下的什么位置。</td>
</tr>
<tr>
<td align="left">&#x2F;&#x2F;@lang</td>
<td align="left">选取名为 lang 的所有属性。</td>
</tr>
</tbody></table>
<h3 id="谓语（Predicates）"><a href="#谓语（Predicates）" class="headerlink" title="谓语（Predicates）"></a>谓语（Predicates）</h3><p>谓语用来查找某个特定的节点或者包含某个指定的值的节点。</p>
<p>谓语被嵌在方括号中。</p>
<p><strong>实例</strong></p>
<p>在下面的表格中，我们列出了带有谓语的一些路径表达式，以及表达式的结果：</p>
<table>
<thead>
<tr>
<th align="left">路径表达式</th>
<th align="left">结果</th>
</tr>
</thead>
<tbody><tr>
<td align="left">&#x2F;bookstore&#x2F;book[1]</td>
<td align="left">选取属于 bookstore 子元素的第一个 book 元素。</td>
</tr>
<tr>
<td align="left">&#x2F;bookstore&#x2F;book[last()]</td>
<td align="left">选取属于 bookstore 子元素的最后一个 book 元素。</td>
</tr>
<tr>
<td align="left">&#x2F;bookstore&#x2F;book[last()-1]</td>
<td align="left">选取属于 bookstore 子元素的倒数第二个 book 元素。</td>
</tr>
<tr>
<td align="left">&#x2F;bookstore&#x2F;book[position()&lt;3]</td>
<td align="left">选取最前面的两个属于 bookstore 元素的子元素的 book 元素。</td>
</tr>
<tr>
<td align="left">&#x2F;&#x2F;title[@lang]</td>
<td align="left">选取所有拥有名为 lang 的属性的 title 元素。</td>
</tr>
<tr>
<td align="left">&#x2F;&#x2F;title[@lang&#x3D;&#39;eng&#39;]</td>
<td align="left">选取所有 title 元素，且这些元素拥有值为 eng 的 lang 属性。</td>
</tr>
<tr>
<td align="left">&#x2F;bookstore&#x2F;book[price&gt;35.00]</td>
<td align="left">选取 bookstore 元素的所有 book 元素，且其中的 price 元素的值须大于 35.00。</td>
</tr>
<tr>
<td align="left">&#x2F;bookstore&#x2F;book[price&gt;35.00]&#x2F;title</td>
<td align="left">选取 bookstore 元素中的 book 元素的所有 title 元素，且其中的 price 元素的值须大于 35.00。</td>
</tr>
</tbody></table>
<h3 id="选取未知节点"><a href="#选取未知节点" class="headerlink" title="选取未知节点"></a>选取未知节点</h3><p>XPath 通配符可用来选取未知的 XML 元素。</p>
<table>
<thead>
<tr>
<th align="left">通配符</th>
<th align="left">描述</th>
</tr>
</thead>
<tbody><tr>
<td align="left">*</td>
<td align="left">匹配任何元素节点。</td>
</tr>
<tr>
<td align="left">@*</td>
<td align="left">匹配任何属性节点。</td>
</tr>
<tr>
<td align="left">node()</td>
<td align="left">匹配任何类型的节点。</td>
</tr>
</tbody></table>
<p><strong>实例</strong></p>
<p>在下面的表格中，我们列出了一些路径表达式，以及这些表达式的结果：</p>
<table>
<thead>
<tr>
<th align="left">路径表达式</th>
<th align="left">结果</th>
</tr>
</thead>
<tbody><tr>
<td align="left">&#x2F;bookstore&#x2F;*</td>
<td align="left">选取 bookstore 元素的所有子元素。</td>
</tr>
<tr>
<td align="left">&#x2F;&#x2F;*</td>
<td align="left">选取文档中的所有元素。</td>
</tr>
<tr>
<td align="left">&#x2F;&#x2F;title[@*]</td>
<td align="left">选取所有带有属性的 title 元素。</td>
</tr>
</tbody></table>
<h3 id="选取若干路径"><a href="#选取若干路径" class="headerlink" title="选取若干路径"></a>选取若干路径</h3><p>通过在路径表达式中使用“|”运算符，您可以选取若干个路径。</p>
<p><strong>实例</strong></p>
<p>在下面的表格中，我们列出了一些路径表达式，以及这些表达式的结果：</p>
<table>
<thead>
<tr>
<th align="left">路径表达式</th>
<th align="left">结果</th>
</tr>
</thead>
<tbody><tr>
<td align="left">&#x2F;&#x2F;book&#x2F;title | &#x2F;&#x2F;book&#x2F;price</td>
<td align="left">选取 book 元素的所有 title 和 price 元素。</td>
</tr>
<tr>
<td align="left">&#x2F;&#x2F;title | &#x2F;&#x2F;price</td>
<td align="left">选取文档中的所有 title 和 price 元素。</td>
</tr>
<tr>
<td align="left">&#x2F;bookstore&#x2F;book&#x2F;title | &#x2F;&#x2F;price</td>
<td align="left">选取属于 bookstore 元素的 book 元素的所有 title 元素，以及文档中所有的 price 元素。</td>
</tr>
</tbody></table>
<h2 id="参考"><a href="#参考" class="headerlink" title="参考"></a>参考</h2><ul>
<li><p><a href="/go.html?u=https://www.w3school.com.cn/xpath/xpath_syntax.asp">XPath 语法</a></p>
</li>
<li><p><a href="/go.html?u=https://bbs.csdn.net/topics/391842853">求教！！Python xpath如何获取带注释的text？-CSDN论坛</a></p>
</li>
<li><p><a href="/go.html?u=https://www.jb51.net/article/154569.htm">python xpath获取页面注释的方法_python_脚本之家</a></p>
</li>
</ul>

    </div>

    
    
    

    <footer class="post-footer">
          <div class="post-tags">
              <a href="/tags/xpath/" rel="tag"># xpath</a>
              <a href="/tags/lxml/" rel="tag"># lxml</a>
          </div>

        

          <div class="post-nav">
            <div class="post-nav-item">
                <a href="/post/20191031-netERR_BLOCKED_BY_CLIENT/" rel="prev" title="netERR_BLOCKED_BY_CLIENT">
                  <i class="fa fa-chevron-left"></i> netERR_BLOCKED_BY_CLIENT
                </a>
            </div>
            <div class="post-nav-item">
                <a href="/post/20191108-selenium_an_zhuang_he_Chromedriver/" rel="next" title="selenium安装和Chromedriver">
                  selenium安装和Chromedriver <i class="fa fa-chevron-right"></i>
                </a>
            </div>
          </div>
    </footer>
  </article>
</div>






    
  <div class="comments" id="disqus_thread">
    <noscript>Please enable JavaScript to view the comments powered by Disqus.</noscript>
  </div>
  
</div>
  </main>

  <footer class="footer">
    <div class="footer-inner">


<div class="copyright">
  &copy; 
  <span itemprop="copyrightYear">2024</span>
  <span class="with-love">
    <i class="fa fa-heart"></i>
  </span>
  <span class="author" itemprop="copyrightHolder">JackCh3n</span>
</div>
<div class="busuanzi-count">
    <span class="post-meta-item" id="busuanzi_container_site_uv">
      <span class="post-meta-item-icon">
        <i class="fa fa-user"></i>
      </span>
      <span class="site-uv" title="Total Visitors">
        <span id="busuanzi_value_site_uv"></span>
      </span>
    </span>
    <span class="post-meta-item" id="busuanzi_container_site_pv">
      <span class="post-meta-item-icon">
        <i class="fa fa-eye"></i>
      </span>
      <span class="site-pv" title="Total Views">
        <span id="busuanzi_value_site_pv"></span>
      </span>
    </span>
</div>
  <div class="powered-by">Powered by <a href="https://hexo.io/" rel="noopener" target="_blank">Hexo</a> &amp; <a href="https://theme-next.js.org/" rel="noopener" target="_blank">NexT.Gemini</a>
  </div>
    <br/>Powerful CDN service provided by 
  <a target="_blank" rel="noopener" href="https://www.upyun.com/?utm_source=lianmeng&utm_medium=referral" style="border:none"><img src="https://www.wulihub.com.cn/gc/Wez0VG/images/upyun_logo.png" height="66" style="display: list-item;" alt="upyun"></a> 
  <a target="_blank" rel="noopener" href="https://www.jsdelivr.com" style="border:none"><img src="https://www.wulihub.com.cn/gc/Wez0VG/images/jsdelivr_logo.svg" height="50" style="display: list-item;" alt="JsDelivr"></a>
  <p>
  <br/><a target="_blank" rel="noopener" href="https://pages.github.com/?f=nobige">Github Pages</a>&nbsp;&nbsp;<a target="_blank" rel="noopener" href="https://vercel.com/?f=nobige">Vercel</a>&nbsp;&nbsp;<a target="_blank" rel="noopener" href="https://www.netlify.com/?f=nobige">Netlify</a>&nbsp;&nbsp;<a target="_blank" rel="noopener" href="https://coding.net/?f=nobige">coding</a>&nbsp;&nbsp;provides static publishing
  </p>

    </div>
  </footer>

  
  <script src="https://npm.elemecdn.com/animejs@3.2.1/lib/anime.min.js" integrity="sha256-XL2inqUJaslATFnHdJOi9GfQ60on8Wx1C2H8DYiN1xY=" crossorigin="anonymous"></script>
<script src="https://www.wulihub.com.cn/gc/Wez0VG/js/comments.js"></script><script src="https://www.wulihub.com.cn/gc/Wez0VG/js/utils.js"></script><script src="https://www.wulihub.com.cn/gc/Wez0VG/js/motion.js"></script><script src="https://www.wulihub.com.cn/gc/Wez0VG/js/next-boot.js"></script>

  
<script src="https://npm.elemecdn.com/hexo-generator-searchdb@1.4.0/dist/search.js" integrity="sha256-vXZMYLEqsROAXkEw93GGIvaB2ab+QW6w3+1ahD9nXXA=" crossorigin="anonymous"></script>
<script src="https://www.wulihub.com.cn/gc/Wez0VG/js/third-party/search/local-search.js"></script>





  
  <script async src="https://busuanzi.ibruce.info/busuanzi/2.3/busuanzi.pure.mini.js"></script>




<link rel="stylesheet" href="https://npm.elemecdn.com/disqusjs@1.3.0/dist/disqusjs.css" integrity="sha256-GxdCIOyfxQ1OBfS99qAIJDoGK1ADuBsxhMTqXG82fAY=" crossorigin="anonymous">

<script class="next-config" data-name="disqusjs" type="application/json">{"enable":true,"api":"https://duosuo.nobige.cn/","apikey":"Kmo9kPwsir7ppp8tGpeGROCmUtW8tdZBOdBVowPC67xVfTS5uV1wr6s4kckvP95m","shortname":"nobige","js":{"url":"https://npm.elemecdn.com/disqusjs@1.3.0/dist/disqus.js","integrity":"sha256-LVaMHPQ2zLqOc5rXSAfr4d1PIkEGNLyyUTDNPZmTtUw="}}</script>
<script src="https://www.wulihub.com.cn/gc/Wez0VG/js/third-party/comments/disqusjs.js"></script>

</body>
</html>
